####################################
# Title - Event-Based Framing of Democracy in American News Media
# Date - May 28th, 2024
# Goal - Find nns of democracy
####################################

# NOTE: the workspace is deliberately NOT wiped here. `rm(list = ls())` only
# removes objects from the global environment (attached packages and options
# survive), so it does not give a clean session and it silently destroys the
# objects of anyone who sources this file. Run the script in a fresh R session
# instead. Every object used below is created before it is read.

  # Library
  library(quanteda)   # corpus(), tokens(), dfm() and related helpers
  library(dplyr)      # %>%, filter(), select(), rename(), group_by(), ...
  library(text2vec)
  library(conText)    # tokens_context(), dem(), get_nns(), ncs(), get_cos_sim()
  library(ggplot2)    # plotting
  library(lubridate)

  # Dataset
  data <- readRDS("data/public_nyt_cleaned_articles.rds")
  # Embedding model is already trained by researchers at Stanford
  glove_vectors <- readRDS("data/glove.rds")
  # Matrix handed to conText as `transform_matrix` further down
  transform_vectors <- readRDS("data/khodakA.rds")

#########################
#
# Preprocessing
#
#########################

  # Grouping variables. The factor copy of the year is created before
  # `pub_year` is coerced to numeric, so it is built from the original values.
  data[["pub_month_factor"]] <- as.factor(data[["pub_month"]])
  data[["pub_year_factor"]] <- as.factor(data[["pub_year"]])
  data[["pub_year"]] <- as.numeric(data[["pub_year"]])
  # Constant column: the value 1 for every row.
  data[["global"]] <- 1

  # Build the quanteda corpus; the remaining columns of `data` travel along as
  # document variables.
  # NOTE(review): quanteda documents `meta` as a named list of corpus-level
  # metadata; the logical TRUE is passed through unchanged here -- confirm
  # that this is what was intended.
  text_corpus <- corpus(
    data,
    docid_field = "docid",
    text_field = "clean_text",
    meta = TRUE
  )

  # Descriptive statistics of the corpus
    # One summary row per document (n = nrow(data) lifts summary()'s default
    # cap on the number of documents), with the document variables attached.
    summary_corpus <- summary(text_corpus, n = nrow(data), showmeta = TRUE)

    # Distribution of document length in tokens, plus 10th/90th percentiles
    summary(summary_corpus[["Tokens"]])
    quantile(summary_corpus[["Tokens"]], probs = c(.1, .9))

    # Same for document length in sentences
    summary(summary_corpus[["Sentences"]])
    quantile(summary_corpus[["Sentences"]], probs = c(.1, .9))

    # Number of documents per publication date
    count <- as.data.frame(table(summary_corpus$pub_date))
    summary(count[["Freq"]])

    # Number of articles whose full text mentions the target term.
    # Matching is case-sensitive, so only the spellings listed in the pattern
    # count (capitalised "Democratic" is not among them).
    case_dem <- grep(
      pattern = "democracy|Democracy|democracies|Democracies|democracy's|Democracy’s|democracies'|Democracies'|democratic",
      x = data$full_text,
      ignore.case = FALSE
    )
    length(case_dem)

  # Tokenise the corpus. Punctuation, symbols, URLs and separators are dropped;
  # numbers are kept.
  toks <- tokens(
    text_corpus,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = FALSE,
    remove_url = TRUE,
    remove_separators = TRUE
  )

  # Drop English stopwords and any token shorter than 3 characters
  toks_nostop <- tokens_select(
    toks,
    pattern = stopwords("en"),
    selection = "remove",
    min_nchar = 3
  )

  # Vocabulary: lower-cased features occurring at least 5 times in the corpus
  feats <- featnames(
    dfm_trim(
      dfm(toks_nostop, tolower = TRUE, verbose = FALSE),
      min_termfreq = 5
    )
  )

  # Optional spell check (skipped when hunspell is not installed). Features are
  # upper-cased before checking so that names are not flagged as misspelled.
  if (requireNamespace("hunspell", quietly = TRUE)) {
    library(hunspell) # spell check library
    spellcheck <- hunspell_check(
      toupper(feats),
      dict = hunspell::dictionary("en_US")
    )
    feats <- feats[spellcheck]
  }

  # Keep only vocabulary tokens. padding = TRUE leaves an empty placeholder
  # where a token is removed, so non-adjacent words do not become adjacent.
  toks_nostop_feats <- tokens_select(toks_nostop, feats, padding = TRUE)


  # Contexts (6 tokens either side) around every occurrence of the target term.
  # The spellings are matched literally and case-sensitively (no regex/glob),
  # so longer words that merely contain the target, e.g. "undemocracy", are
  # not picked up.
  # NOTE(review): "DEMOCRACY's" has a lower-case "s" and capitalised
  # "Democratic" is not in the list -- confirm both are intended.
  demo_toks <- tokens_context(
    x = toks_nostop_feats,
    pattern = c(
      "democracy", "Democracy", "DEMOCRACY",
      "democracies", "Democracies", "DEMOCRACIES",
      "democracy's", "Democracy's", "DEMOCRACY's",
      "democratic"
    ),
    window = 6L,
    valuetype = "fixed",
    case_insensitive = FALSE,
    hard_cut = FALSE,
    rm_keyword = FALSE,
    verbose = TRUE
  )

  # Free the large intermediate objects that are no longer needed
  rm(text_corpus, toks, toks_nostop, feats)

  # Document-feature matrix of the context windows
  demo_dfm <- dfm(demo_toks)

  # Document-embedding matrix: one embedding per context, built from the
  # pre-trained GloVe vectors (300-dimensional according to the original note)
  # and the transformation matrix loaded above.
  demo_dem <- dem(
    x = demo_dfm,
    pre_trained = glove_vectors,
    transform = TRUE,
    transform_matrix = transform_vectors,
    verbose = TRUE
  )

  # Single "corpus-wide" embedding: the column means of the context
  # embeddings, stored as a one-row matrix labelled "democracy".
  demo_wv <- matrix(colMeans(demo_dem), ncol = ncol(demo_dem))
  rownames(demo_wv) <- "democracy"
  dim(demo_wv)
  
##############
#
# Get Nearest neighbors with CI
#
##############
  
  # Candidate vocabulary for the nearest-neighbour search: every feature that
  # occurs in the "democracy" context windows.
  feats <- featnames(dfm(demo_toks))

  # Constant grouping variable, used for the corpus-wide ("global") run.
  docvars(demo_toks)$global <- 1

  # Bootstrapped nearest neighbours of the target embedding for one grouping.
  # The five runs below differ only in the grouping variable, the `stem` flag
  # and the output shape, so everything else is fixed here in one place.
  #   group_var  name of the docvar of `demo_toks` that defines the groups
  #   stem       forwarded to get_nns(stem = )
  #   as_list    forwarded to get_nns(as_list = )
  # The seed is reset before every run, exactly as in the stand-alone calls
  # this helper replaces, so each run's bootstrap draws are reproducible
  # independently of the others.
  run_demo_nns <- function(group_var, stem = FALSE, as_list = TRUE) {
    set.seed(2021L)
    get_nns(x = demo_toks,
            N = 50,
            groups = docvars(demo_toks, group_var),
            candidates = feats,
            pre_trained = glove_vectors,
            transform = TRUE,
            transform_matrix = transform_vectors,
            bootstrap = TRUE,
            num_bootstraps = 100,
            confidence_level = 0.95,
            stem = stem,
            as_list = as_list)
  }

  #Global
    demo_global_nns <- run_demo_nns("global")
    saveRDS(demo_global_nns, "data/demo2_global_nns.rds")

  #Year (without stem)
    demo_year_nns <- run_demo_nns("pub_year_factor")
    saveRDS(demo_year_nns, "data/demo2_year_nns.rds")

  #Year (with stem)
    demo_year_nns_stem <- run_demo_nns("pub_year_factor", stem = TRUE)
    saveRDS(demo_year_nns_stem, "data/demo2_year_nns_stem.rds")

  #Month (without stem)
    demo_month_nns <- run_demo_nns("pub_month_factor")
    saveRDS(demo_month_nns, "data/demo2_month_nns.rds")

  #Month (with stem)
    # NOTE(review): this is the only run with as_list = FALSE. It is kept so the
    # structure of the saved object does not change -- confirm it is intended.
    demo_month_nns_stem <- run_demo_nns("pub_month_factor", stem = TRUE,
                                        as_list = FALSE)
    saveRDS(demo_month_nns_stem, "data/demo2_month_nns_stem.rds")

  # Reload the saved results. When the .rds files already exist, the expensive
  # bootstrap runs above can be skipped and the script resumed from here.
  demo_global_nns     <- readRDS("data/demo2_global_nns.rds")
  demo_year_nns       <- readRDS("data/demo2_year_nns.rds")
  demo_year_nns_stem  <- readRDS("data/demo2_year_nns_stem.rds")
  demo_month_nns      <- readRDS("data/demo2_month_nns.rds")
  demo_month_nns_stem <- readRDS("data/demo2_month_nns_stem.rds")
    
###############################################################
#
# Find representative usages
#
###############################################################
    
    # Top 50 corpus-wide nearest neighbours ("1" is the single global group)
    print(demo_global_nns[["1"]][1:50, 1:7], n = 50)

    #Check the real usages
      # Rank every embedded context by similarity to the corpus-wide
      # embedding. N is taken from the data instead of a hard-coded count so it
      # stays correct when the corpus changes.
      # NOTE(review): the original literal was 179107 -- presumably the number
      # of rows of demo_dem at the time; confirm.
      n_contexts <- nrow(demo_dem)

      # Called twice: once with the tokens, so `context` holds the context
      # text, and once with contexts = NULL, so `context` holds the document
      # name (per the conText documentation of ncs()). The two are joined on
      # rank so each context text carries its document name as `text_num`.
      list_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem,
                      contexts = demo_toks, N = n_contexts, as_list = FALSE)
      text_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem,
                      contexts = NULL, N = n_contexts, as_list = FALSE)

      list_ncs <- merge(list_ncs,
                        text_ncs %>%
                          select(-value, -target) %>%
                          rename(text_num = context),
                        by = c("rank"))

      # Top 20 global neighbours, minus the target word itself
      top_nns_20 <- demo_global_nns[["1"]][1:20, 2]
      print(top_nns_20, n = 50)
      top_nns_20 <- top_nns_20 %>%
        filter(feature != "democracy" & feature != "democracies")

      # For each neighbour, the row positions of the contexts that mention it.
      # The neighbour is used as a case-insensitive regular expression, so it
      # also matches inside longer words. Built as a list and bound once
      # instead of growing a data frame inside a loop.
      result <- bind_rows(lapply(top_nns_20$feature, function(w) {
        hits <- grep(pattern = w, list_ncs$context, ignore.case = TRUE)
        data.frame(words = rep(w, length(hits)), example = hits)
      }))

      list_ncs <- list_ncs[result$example, ]
      list_ncs <- cbind(list_ncs, result) %>% select(-target, -rank)

      # Attach the full article text of every selected context.
      # The column is addressed by its real name (`text_num`); the previous
      # `list_ncs$text` only worked through `$` partial matching. Rows are
      # looked up by document name rather than by the number embedded in the
      # name, so the lookup stays aligned even if dem() dropped some contexts.
      # NOTE(review): assumes rownames(demo_dem) are the document names that
      # ncs() reports when contexts = NULL -- confirm.
      doc_rows <- match(as.character(list_ncs$text_num), rownames(demo_dem))
      list_ncs$real <- demo_dem@docvars$full_text[doc_rows]

      # Three most similar contexts per neighbour word
      list_ncs_top3 <- list_ncs %>%
        group_by(words) %>%
        arrange(desc(value)) %>%
        slice_head(n = 3) %>%
        ungroup()
      #saveRDS(list_ncs_top3, file = "validation_nns_democracy.rds")
      
###############################################################
#
# Visualization - Year
#
###############################################################  
      
    #No Stem
      # Years present in the data; used to pick the per-year entries of
      # demo_year_nns (a list with one data frame per year, see as_list = TRUE
      # in the get_nns call that produced it).
      a <- unique(data$pub_year_factor)

      # Stack the yearly tables in one step. Years without an entry come back
      # as NULL and are ignored by bind_rows(), matching the previous
      # placeholder-row + rbind() loop without growing an object per iteration.
      result_df <- as.data.frame(bind_rows(demo_year_nns[as.character(a)]))
      result_df$stem <- FALSE

      # Keep the 15 nearest neighbours of each year, then count how often each
      # word appears across years
      result_df <- result_df %>% filter(rank <= 15)
      table(result_df$feature)
    
  #Draw the graph with multiple keywords

    #All words should be lowercased
    target_word <- c("pluralist", "pluralism", "pluralistic", "democratization",
                     "authoritarianism", "socialism", "freedoms",
                     "constitutionalism", "autocracy", "jeffersonian",
                     "multiparty", "ideals", "secularism")

    # Bootstrapped cosine similarity between each keyword and the yearly
    # "democracy" embedding. The seed is fixed so the bootstrap intervals are
    # reproducible.
    set.seed(2021L)
    target_cos_year <- get_cos_sim(x = demo_toks,
                                   groups = docvars(demo_toks, 'pub_year_factor'),
                                   features = target_word,
                                   pre_trained = glove_vectors,
                                   transform = TRUE,
                                   transform_matrix = transform_vectors,
                                   bootstrap = TRUE,
                                   num_bootstraps = 100,
                                   confidence_level = 0.95,
                                   stem = FALSE,  # no stemming: keywords are matched as written
                                   as_list = FALSE)

    #Who changed the most
    # Gap between the last and the first row of each keyword. last()/first()
    # replace the hard-coded positions 22 and 1, so the comparison does not
    # break when the number of years changes.
    # NOTE(review): assumes rows within each keyword are in chronological
    # order, as the positional indexing already did -- confirm.
    target_cos_year %>%
      group_by(feature) %>%
      summarise(gap = dplyr::last(value) - dplyr::first(value))
    
    # Common y-axis limits so the panels are directly comparable
    y_low <- min(target_cos_year$lower.ci)
    y_max <- max(target_cos_year$upper.ci)

    # One panel per keyword: point estimate (solid) with the bootstrap
    # confidence bounds (dashed). group = 1 is required because `target` (the
    # year group) is discrete: without it every x value is its own group and
    # geom_line() has nothing to connect.
    plots <- lapply(target_word, function(word) {
      filtered_data <- target_cos_year %>% filter(feature == word)

      ggplot(filtered_data, aes(x = target, group = 1)) +
        geom_line(aes(y = value), color = "black") +
        geom_line(aes(y = lower.ci), linetype = "dashed") +
        geom_line(aes(y = upper.ci), linetype = "dashed") +
        ylim(y_low, y_max) +
        xlab(word) +
        ylab("") +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    })
    names(plots) <- target_word

    library(ggpubr) # ggarrange

    # Arrange all panels on a 4 x 4 grid. The whole list is used rather than a
    # hard-coded 1:14 index, which ran past the 13 plots that exist.
    year_panel <- ggarrange(plotlist = plots, nrow = 4, ncol = 4)
    print(year_panel)

    # Pass the figure explicitly: ggsave() otherwise falls back to
    # last_plot(), which is only set when a plot has been printed and so is
    # unreliable when the script is sourced.
    ggsave(filename = "nns_cosine_similarity_year.jpeg", plot = year_panel,
           width = 8, height = 8, dpi = 1000)
